import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the Online Retail transactions dataset from Excel.
# NOTE(review): hard-coded absolute Windows path — this only runs on this
# machine; consider a relative path or a configurable location.
data = pd.read_excel("C:/Users/PREDATOR/Downloads/Online Retail.xlsx")
# Preview the first five rows to sanity-check the load.
data.head()
| | InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country |
|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
# Dtypes and non-null counts: Description and CustomerID contain missing values.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 541909 entries, 0 to 541908 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 InvoiceNo 541909 non-null object 1 StockCode 541909 non-null object 2 Description 540455 non-null object 3 Quantity 541909 non-null int64 4 InvoiceDate 541909 non-null datetime64[ns] 5 UnitPrice 541909 non-null float64 6 CustomerID 406829 non-null float64 7 Country 541909 non-null object dtypes: datetime64[ns](1), float64(2), int64(1), object(4) memory usage: 33.1+ MB
# Summary statistics for the numeric columns. Note the negative minimums in
# Quantity and UnitPrice — presumably cancellations/returns or data errors;
# they are handled as outliers later.
data.describe()
| | Quantity | UnitPrice | CustomerID |
|---|---|---|---|
| count | 541909.000000 | 541909.000000 | 406829.000000 |
| mean | 9.552250 | 4.611114 | 15287.690570 |
| std | 218.081158 | 96.759853 | 1713.600303 |
| min | -80995.000000 | -11062.060000 | 12346.000000 |
| 25% | 1.000000 | 1.250000 | 13953.000000 |
| 50% | 3.000000 | 2.080000 | 15152.000000 |
| 75% | 10.000000 | 4.130000 | 16791.000000 |
| max | 80995.000000 | 38970.000000 | 18287.000000 |
# Split the invoice timestamp into separate string columns: Date ('YYYY-MM-DD')
# and Time ('HH:MM:SS') for easier grouping and display.
# InvoiceDate is already datetime64 (see data.info() above), so to_datetime is
# a cheap no-op kept as a safeguard should the source file change.
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
# dt.strftime produces the string form directly, replacing the original
# two-step dt.date/dt.time extraction followed by astype(str) — identical
# output. (Dead commented-out code removed; it also contained a copy-paste
# bug that derived Time from the Date column.)
data['Date'] = data['InvoiceDate'].dt.strftime('%Y-%m-%d')
data['Time'] = data['InvoiceDate'].dt.strftime('%H:%M:%S')
print(data[['Time', 'Date']])
Time Date 0 08:26:00 2010-12-01 1 08:26:00 2010-12-01 2 08:26:00 2010-12-01 3 08:26:00 2010-12-01 4 08:26:00 2010-12-01 ... ... ... 541904 12:50:00 2011-12-09 541905 12:50:00 2011-12-09 541906 12:50:00 2011-12-09 541907 12:50:00 2011-12-09 541908 12:50:00 2011-12-09 [541909 rows x 2 columns]
# Count missing values per column (Description: 1454, CustomerID: 135080).
data.isna().sum()
InvoiceNo 0 StockCode 0 Description 1454 Quantity 0 InvoiceDate 0 UnitPrice 0 CustomerID 135080 Country 0 Date 0 Time 0 dtype: int64
# Number of distinct customers with a known CustomerID.
data['CustomerID'].nunique()
4372
# Drop every row with any missing value — this removes all rows lacking a
# CustomerID (the Description gaps are a subset of far fewer rows), shrinking
# the data from 541909 to 406829 rows.
# NOTE(review): the original integer index keeps its gaps; add
# .reset_index(drop=True) if a contiguous index is needed downstream.
data = data.dropna()
# Confirm that no missing values remain.
data.isna().sum()
InvoiceNo 0 StockCode 0 Description 0 Quantity 0 InvoiceDate 0 UnitPrice 0 CustomerID 0 Country 0 Date 0 Time 0 dtype: int64
# Re-inspect dtypes and row count after dropping rows with missing values.
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 406829 entries, 0 to 541908 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 InvoiceNo 406829 non-null object 1 StockCode 406829 non-null object 2 Description 406829 non-null object 3 Quantity 406829 non-null int64 4 InvoiceDate 406829 non-null datetime64[ns] 5 UnitPrice 406829 non-null float64 6 CustomerID 406829 non-null float64 7 Country 406829 non-null object 8 Date 406829 non-null object 9 Time 406829 non-null object dtypes: datetime64[ns](1), float64(2), int64(1), object(6) memory usage: 34.1+ MB
# Only select numeric columns for the boxplots (Quantity, UnitPrice, CustomerID).
numeric_columns = data.select_dtypes(include=['number']).columns
# Three numeric columns -> a single row of three subplots.
# (Fixed a stale comment that claimed "6 graphs / 8 grids".)
nrows = 1
ncols = 3
# Set up the subplot grid.
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(8, 5))
fig.suptitle('Box Plots for Customer Data Columns (Numerical Values Only)')
# Plot each numeric column's data in a separate subplot.
axes = axes.ravel()
for i, column in enumerate(numeric_columns):
    data.boxplot(column=column, ax=axes[i])
    axes[i].set_title(column)
    axes[i].set_xticklabels([])
# Hide any leftover empty axes in case the number of numeric columns shrinks.
for ax in axes[len(numeric_columns):]:
    ax.set_visible(False)
plt.subplots_adjust(top=0.95)
plt.tight_layout()
plt.show()
Since quantity should never be negative, the negative values and extreme magnitudes visible in these box plots are outliers (likely cancellations/returns or data errors) that we remove next.
# Interquartile-range (IQR) method for removing outliers: for each measured
# column keep rows within [Q1 - 1.5*IQR, Q3 + 1.5*IQR], filtering sequentially
# (each column's quartiles are computed on the already-filtered data).
# FIX: CustomerID is an identifier, not a measurement — clipping rows by the
# "quartiles" of an ID is meaningless, so it is excluded. On this dataset the
# ID fence spans the full ID range, so the result should be unchanged — verify
# if exact reproduction of the original rows matters.
for col in [c for c in numeric_columns if c != 'CustomerID']:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    data = data[(data[col] >= lower_bound) & (data[col] <= upper_bound)]
# Re-draw the box plots after IQR filtering to confirm the extreme values
# are gone.
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(8, 5))
fig.suptitle('Box Plots for Customer Data Columns')
# One subplot per numeric column, with x tick labels suppressed.
axes = axes.ravel()
for idx, col in enumerate(numeric_columns):
    ax = axes[idx]
    data.boxplot(column=col, ax=ax)
    ax.set_title(col)
    ax.set_xticklabels([])
plt.subplots_adjust(top=0.95)
plt.tight_layout()
plt.show()
# Per-customer (per-country) aggregates:
#   Total_Invoices — number of distinct invoices (orders) placed
#   Total_Quantity — total number of items purchased
#   Total_Spend    — total money spent
# BUG FIX: the original summed UnitPrice alone, which ignores the quantity on
# each invoice line and therefore misstates spend; a line's revenue is
# Quantity * UnitPrice. Computed on a derived frame so `data` itself is not
# modified.
line_totals = data.assign(Line_Spend=data['Quantity'] * data['UnitPrice'])
customer_stats = line_totals.groupby(['CustomerID', 'Country']).agg(
    Total_Invoices=('InvoiceNo', 'nunique'),
    Total_Quantity=('Quantity', 'sum'),
    Total_Spend=('Line_Spend', 'sum')).reset_index()
# Avg_Purchase_Frequency: invoice count normalized by the busiest customer's
# invoice count — a relative frequency in (0, 1].
customer_stats['Avg_Purchase_Frequency'] = customer_stats['Total_Invoices'] / customer_stats['Total_Invoices'].max()
# Avg_Spend: average spend per invoice for each customer.
customer_stats['Avg_Spend'] = customer_stats['Total_Spend'] / customer_stats['Total_Invoices']
import plotly.express as px
# Summarize the customer-level stats at the country level for a readable chart.
country_stats = (
    customer_stats
    .groupby('Country')
    .agg(Avg_Spend=('Avg_Spend', 'mean'),
         Avg_Purchase_Frequency=('Avg_Purchase_Frequency', 'mean'),
         Total_Spend=('Total_Spend', 'sum'))
    .reset_index()
)
# Bubble chart: x = relative purchase frequency, y = average spend per invoice,
# bubble size = total spend; both axes use a log scale.
fig = px.scatter(
    country_stats,
    x='Avg_Purchase_Frequency',
    y='Avg_Spend',
    size='Total_Spend',   # bubble size scales with total spend
    color='Country',      # one color per country
    hover_name='Country',
    log_x=True,
    log_y=True,
    title='Customer Loyalty and Purchasing Frequency by Country',
    labels={'Avg_Purchase_Frequency': 'Average Purchase Frequency', 'Avg_Spend': 'Average Spend'},
    size_max=60,
)
fig.update_layout(
    xaxis_title="Average Purchase Frequency (Log Scale)",
    yaxis_title="Average Spend (Log Scale)",
)
# Custom hover text showing the underlying values.
hover_lines = [
    "Country: %{hovertext}",
    "Average Spend: %{y}",
    "Average Purchase Frequency: %{x}",
    "Total Spend: %{marker.size:,}",
]
fig.update_traces(hovertemplate="<br>".join(hover_lines))
fig.show()
# Total quantity sold per product (StockCode).
total_sales = data.groupby('StockCode').agg({'Quantity': 'sum'}).reset_index()
# Top 10 best-selling products by total quantity.
top_10 = total_sales.nlargest(10, 'Quantity')
# (Removed `combined_sales = pd.concat([top_10])` — concatenating a single
# frame is a no-op copy.)
fig = px.bar(top_10, x='StockCode', y='Quantity',
             title='Top 10 Selling Products',
             text='Quantity',
             # Keep the descending-sales ordering on the x-axis.
             category_orders={"StockCode": top_10['StockCode'].tolist()},
             width=800, height=600)
fig.update_layout(
    xaxis_title="Stock Code",
    yaxis_title="Total Quantity Sold",
    xaxis_type='category',  # Treating the StockCode as a categorical variable
    xaxis_tickangle=-45,
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    bargap=0.15,
)
fig.update_traces(width=0.4)
# Show the figure
fig.show()
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Build one row per product with three clustering features: total units sold,
# number of distinct invoices it appears on, and the median unit price
# (median as a representative price, robust to extremes).
product_features = data.groupby('StockCode').agg(
    Total_Sales_Volume=('Quantity', 'sum'),
    Purchase_Frequency=('InvoiceNo', 'nunique'),
    Median_UnitPrice=('UnitPrice', 'median')
).reset_index()
# Standardize the features so each contributes equally to the distance metric.
feature_cols = ['Total_Sales_Volume', 'Purchase_Frequency', 'Median_UnitPrice']
scaled_matrix = StandardScaler().fit_transform(product_features[feature_cols])
# K-means with a fixed seed for reproducibility; tune n_clusters to the data.
model = KMeans(n_clusters=5, random_state=42)
product_features['Cluster'] = model.fit_predict(scaled_matrix)
# Visualize the clusters on a volume-vs-price scatter.
fig = px.scatter(product_features, x='Total_Sales_Volume', y='Median_UnitPrice', color='Cluster',
                 title='Product Clusters Based on StockCode')
fig.show()
There are clusters that contain products with a lower average price and a higher sales volume, which might indicate more commonly purchased items. On the other hand, there are clusters with a higher average price but a lower sales volume, which might represent premium products.
# Using the previously defined 'customer_stats'.
# Loyalty score heuristic: total items purchased * average spend per invoice —
# rewards customers who buy a lot and spend a lot per order.
customer_stats['Loyalty_Score'] = customer_stats['Total_Quantity'] * customer_stats['Avg_Spend']
# FIX: take an explicit .copy() so the astype(str) assignment below writes to
# an independent frame rather than a view of customer_stats, avoiding a
# SettingWithCopyWarning.
top_10_loyal_customers = customer_stats.sort_values(by='Loyalty_Score', ascending=False).head(10).copy()
import plotly.express as px
# CustomerID as string so plotly treats it as a categorical axis label.
top_10_loyal_customers['CustomerID'] = top_10_loyal_customers['CustomerID'].astype(str)
# Horizontal bar chart colored by loyalty score.
fig = px.bar(top_10_loyal_customers.sort_values('Loyalty_Score', ascending=True),
             y='CustomerID', x='Loyalty_Score',
             color='Loyalty_Score',
             color_continuous_scale=px.colors.sequential.Viridis,
             title='Top 10 Most Loyal Customers')
# Improve the layout and design
fig.update_layout(
    xaxis_title='Loyalty Score',
    yaxis_title='Customer ID',
    coloraxis_colorbar=dict(
        title='Loyalty Score'
    ),
    yaxis={'categoryorder':'total ascending'}
)
# Customize the hover text
fig.update_traces(hovertemplate="<br>".join([
    "CustomerID: %{y}",
    "Loyalty Score: %{x}"
]))
fig.show()
# Aggregate sales by calendar month.
# NOTE: InvoiceDate is already datetime64 (converted earlier in the notebook);
# the original re-ran pd.to_datetime here, a no-op that has been dropped.
data['InvoiceMonth'] = data['InvoiceDate'].dt.to_period('M')
monthly_sales = data.groupby('InvoiceMonth').agg(Total_Sales=('Quantity', 'sum')).reset_index()
# Peak-demand months: total sales above the 75th percentile (kept for reference).
peak_demand_months = monthly_sales[monthly_sales['Total_Sales'] > monthly_sales['Total_Sales'].quantile(0.75)]
import plotly.express as px
# Period values are not plotly-serializable; convert to string for the axis.
monthly_sales['InvoiceMonth'] = monthly_sales['InvoiceMonth'].astype(str)
fig = px.line(monthly_sales, x='InvoiceMonth', y='Total_Sales', title='Monthly Sales to Identify Peak Demand Periods')
fig.update_xaxes(type='category')
fig.show()
# Aggregate sales by calendar day.
data['InvoiceDay'] = data['InvoiceDate'].dt.to_period('D')
daily_sales = data.groupby('InvoiceDay').agg(Total_Sales=('Quantity', 'sum')).reset_index()
# Flag unusually busy days: total sales above the 95th percentile may be
# considered increased business activity.
threshold = daily_sales['Total_Sales'].quantile(0.95)
# FIX: .copy() makes high_activity_days an independent frame so that the
# astype(str) assignment below does not raise the SettingWithCopyWarning seen
# in the original notebook output.
high_activity_days = daily_sales[daily_sales['Total_Sales'] > threshold].copy()
# Period values are not plotly-serializable; convert to string for the axis.
daily_sales['InvoiceDay'] = daily_sales['InvoiceDay'].astype(str)
high_activity_days['InvoiceDay'] = high_activity_days['InvoiceDay'].astype(str)
# Daily sales line with the high-activity days overlaid as markers.
fig = px.line(daily_sales, x='InvoiceDay', y='Total_Sales', title='Daily Sales Fluctuations')
fig.add_scatter(x=high_activity_days['InvoiceDay'], y=high_activity_days['Total_Sales'], mode='markers', name='High Activity')
fig.show()
C:\Users\PREDATOR\AppData\Local\Temp\ipykernel_4612\3574997093.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy